{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "#!/usr/bin/python\n",
    "# -*- coding:utf-8 -*-\n",
    "\n",
    "import jieba,os,re\n",
    "from gensim import corpora, models, similarities\n",
    "\n",
    "\"\"\"创建停用词列表\"\"\"\n",
    "def stopwordslist():\n",
    "    stopwords = [line.strip() for line in open('./stopwords.txt',encoding='UTF-8').readlines()]\n",
    "    return stopwords\n",
    "\n",
    "\"\"\"对句子进行中文分词\"\"\"\n",
    "def seg_depart(sentence):\n",
    "    sentence_depart = jieba.cut(sentence.strip())\n",
    "    stopwords = stopwordslist()\n",
    "    outstr = ''\n",
    "    for word in sentence_depart:\n",
    "        if word not in stopwords:\n",
    "            outstr += word\n",
    "            outstr += \" \"\n",
    "    # outstr：'黄蜂 湖人 首发 科比 带伤 战 保罗 加索尔 ...'       \n",
    "    return outstr\n",
    "\n",
    "\"\"\"如果文档还没分词，就进行分词\"\"\"\n",
    "if not os.path.exists('./cnews.train_jieba.txt'):\n",
    "    # 给出文档路径\n",
    "    filename = \"./cnews.train.txt\"\n",
    "    outfilename = \"./cnews.train_jieba.txt\"\n",
    "    inputs = open(filename, 'r', encoding='UTF-8')\n",
    "    outputs = open(outfilename, 'w', encoding='UTF-8')\n",
    "\n",
    "    # 把非汉字的字符全部去掉\n",
    "    # 将输出结果写入ouputs.txt中\n",
    "    for line in inputs:\n",
    "        line = line.split('\\t')[1]\n",
    "        line = re.sub(r'[^\\u4e00-\\u9fa5]+','',line)\n",
    "        line_seg = seg_depart(line.strip())\n",
    "        outputs.write(line_seg.strip() + '\\n')\n",
    "    \n",
    "    outputs.close()\n",
    "    inputs.close()\n",
    "    print(\"删除停用词和分词成功！！！\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "10个主题的单词分布为：\n",
      "\n",
      "(0, '0.006*\"万\" + 0.006*\"中\" + 0.005*\"元\" + 0.004*\"高速\" + 0.003*\"句子\" + 0.003*\"市场\" + 0.003*\"公司\" + 0.003*\"约\" + 0.002*\"说\" + 0.002*\"时间\"')\n",
      "(1, '0.010*\"中\" + 0.005*\"做\" + 0.005*\"英语\" + 0.004*\"单词\" + 0.004*\"说\" + 0.003*\"镜头\" + 0.003*\"文章\" + 0.003*\"公司\" + 0.003*\"时间\" + 0.003*\"搭载\"')\n",
      "(2, '0.017*\"申购\" + 0.015*\"净值\" + 0.014*\"分红\" + 0.010*\"索尼\" + 0.009*\"公司\" + 0.007*\"中\" + 0.005*\"减持\" + 0.004*\"产品\" + 0.004*\"市场\" + 0.004*\"行业\"')\n",
      "(3, '0.013*\"考试\" + 0.010*\"基金\" + 0.006*\"中\" + 0.005*\"公司\" + 0.005*\"英寸\" + 0.005*\"万\" + 0.005*\"采用\" + 0.005*\"视频\" + 0.004*\"中国\" + 0.004*\"债券\"')\n",
      "(4, '0.089*\"基金\" + 0.015*\"投资\" + 0.014*\"市场\" + 0.011*\"股票\" + 0.010*\"公司\" + 0.009*\"中\" + 0.008*\"行业\" + 0.007*\"经理\" + 0.006*\"新\" + 0.006*\"经济\"')\n",
      "(5, '0.011*\"索尼\" + 0.008*\"快门\" + 0.008*\"万\" + 0.006*\"拍摄\" + 0.006*\"镜头\" + 0.005*\"高清\" + 0.005*\"小巧\" + 0.004*\"中\" + 0.003*\"短片\" + 0.003*\"性能\"')\n",
      "(6, '0.012*\"功能\" + 0.009*\"拍摄\" + 0.008*\"中\" + 0.008*\"采用\" + 0.008*\"玩家\" + 0.007*\"机身\" + 0.007*\"支持\" + 0.006*\"相机\" + 0.006*\"拥有\" + 0.005*\"游戏\"')\n",
      "(7, '0.024*\"分红\" + 0.014*\"基金\" + 0.012*\"赎回\" + 0.011*\"公司\" + 0.008*\"中\" + 0.004*\"市场\" + 0.004*\"新\" + 0.004*\"账户\" + 0.003*\"产品\" + 0.003*\"定向\"')\n",
      "(8, '0.060*\"基金\" + 0.012*\"市场\" + 0.011*\"公司\" + 0.009*\"投资\" + 0.008*\"中\" + 0.007*\"收益\" + 0.006*\"亿元\" + 0.005*\"投资者\" + 0.005*\"显示\" + 0.005*\"上涨\"')\n",
      "(9, '0.017*\"基金\" + 0.007*\"公司\" + 0.006*\"中\" + 0.005*\"中国\" + 0.005*\"说\" + 0.004*\"新能源\" + 0.003*\"经理\" + 0.003*\"市场\" + 0.003*\"企业\" + 0.003*\"新\"')\n"
     ]
    }
   ],
   "source": [
    "\"\"\"准备好训练语料，整理成gensim需要的输入格式\"\"\"\n",
    "fr = open('./cnews.train_jieba.txt', 'r',encoding='utf-8')\n",
    "train = []\n",
    "for line in fr.readlines():\n",
    "    line = [word.strip() for word in line.split(' ')]\n",
    "    train.append(line)\n",
    "    # train: [['黄蜂', '湖人', '首发', '科比', '带伤', '战',...],[...],...]\n",
    "    \n",
    "\"\"\"构建词频矩阵，训练LDA模型\"\"\"\n",
    "dictionary = corpora.Dictionary(train)\n",
    "# corpus[0]: [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1),...]\n",
    "# corpus是把每条新闻ID化后的结果，每个元素是新闻中的每个词语，在字典中的ID和频率\n",
    "corpus = [dictionary.doc2bow(text) for text in train]\n",
    "\n",
    "lda = models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=10)\n",
    "topic_list = lda.print_topics(10)\n",
    "print(\"10个主题的单词分布为：\\n\")\n",
    "for topic in topic_list:\n",
    "    print(topic)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Building prefix dict from the default dictionary ...\n",
      "Loading model from cache C:\\Users\\SIYI\\AppData\\Local\\Temp\\jieba.cache\n",
      "Loading model cost 1.222 seconds.\n",
      "Prefix dict has been built succesfully.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "这条体育新闻的主题分布为：\n",
      "\n",
      "[(1, 0.36818886), (3, 0.06355113), (5, 0.055832624), (9, 0.5021481)] \n",
      "\n",
      "这条娱乐新闻的主题分布为：\n",
      "\n",
      "[(1, 0.17199118), (3, 0.2358956), (6, 0.281208), (9, 0.3077155)] \n",
      "\n",
      "这条科技新闻的主题分布为：\n",
      "\n",
      "[(3, 0.51802427), (4, 0.19868204), (6, 0.27476338)] \n",
      "\n"
     ]
    }
   ],
   "source": [
    "\"\"\"抽取新闻的主题\"\"\"\n",
    "# 用来测试的三条新闻，分别为体验、娱乐和科技新闻    \n",
    "file_test = \"./cnews.test.txt\"\n",
    "news_test = open(file_test, 'r', encoding='UTF-8')\n",
    "    \n",
    "test = []\n",
    "# 处理成正确的输入格式       \n",
    "for line in news_test:\n",
    "    line = line.split('\\t')[1]\n",
    "    line = re.sub(r'[^\\u4e00-\\u9fa5]+','',line)\n",
    "    line_seg = seg_depart(line.strip())\n",
    "    line_seg = [word.strip() for word in line_seg.split(' ')]\n",
    "    test.append(line_seg)    \n",
    "    \n",
    "# 新闻ID化    \n",
    "corpus_test = [dictionary.doc2bow(text) for text in test]\n",
    "# 得到每条新闻的主题分布\n",
    "topics_test = lda.get_document_topics(corpus_test)  \n",
    "labels = ['体育','娱乐','科技']\n",
    "for i in range(3):\n",
    "    print('这条'+labels[i]+'新闻的主题分布为：\\n')\n",
    "    print(topics_test[i],'\\n')\n",
    "\n",
    "fr.close()\n",
    "news_test.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
